1   package org.apache.lucene.search.suggest.analyzing;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.io.IOException;
21  import java.io.Reader;
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.Collections;
25  import java.util.Comparator;
26  import java.util.HashSet;
27  import java.util.List;
28  import java.util.Set;
29  import java.util.TreeSet;
30  
31  import org.apache.lucene.analysis.Analyzer;
32  import org.apache.lucene.analysis.CannedTokenStream;
33  import org.apache.lucene.analysis.MockAnalyzer;
34  import org.apache.lucene.analysis.MockTokenFilter;
35  import org.apache.lucene.analysis.MockTokenizer;
36  import org.apache.lucene.analysis.Token;
37  import org.apache.lucene.analysis.TokenFilter;
38  import org.apache.lucene.analysis.TokenStream;
39  import org.apache.lucene.analysis.TokenStreamToAutomaton;
40  import org.apache.lucene.analysis.Tokenizer;
41  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
42  import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
43  import org.apache.lucene.search.suggest.Input;
44  import org.apache.lucene.search.suggest.InputArrayIterator;
45  import org.apache.lucene.search.suggest.Lookup.LookupResult;
46  import org.apache.lucene.util.BytesRef;
47  import org.apache.lucene.util.BytesRefBuilder;
48  import org.apache.lucene.util.IntsRef;
49  import org.apache.lucene.util.LuceneTestCase;
50  import org.apache.lucene.util.TestUtil;
51  import org.apache.lucene.util.automaton.Automaton;
52  import org.apache.lucene.util.automaton.FiniteStringsIterator;
53  import org.apache.lucene.util.fst.Util;
54  
55  public class FuzzySuggesterTest extends LuceneTestCase {
56    
57    public void testRandomEdits() throws IOException {
58      List<Input> keys = new ArrayList<>();
59      int numTerms = atLeast(100);
60      for (int i = 0; i < numTerms; i++) {
61        keys.add(new Input("boo" + TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
62      }
63      keys.add(new Input("foo bar boo far", 12));
64      MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
65      FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, true, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
66                                                    0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, FuzzySuggester.DEFAULT_UNICODE_AWARE);
67      suggester.build(new InputArrayIterator(keys));
68      int numIters = atLeast(10);
69      for (int i = 0; i < numIters; i++) {
70        String addRandomEdit = addRandomEdit("foo bar boo", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX);
71        List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
72        assertEquals(addRandomEdit, 1, results.size());
73        assertEquals("foo bar boo far", results.get(0).key.toString());
74        assertEquals(12, results.get(0).value, 0.01F);  
75      }
76      analyzer.close();
77    }
78    
79    public void testNonLatinRandomEdits() throws IOException {
80      List<Input> keys = new ArrayList<>();
81      int numTerms = atLeast(100);
82      for (int i = 0; i < numTerms; i++) {
83        keys.add(new Input("буу" + TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
84      }
85      keys.add(new Input("фуу бар буу фар", 12));
86      MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
87      FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, true, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
88          0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, true);
89      suggester.build(new InputArrayIterator(keys));
90      int numIters = atLeast(10);
91      for (int i = 0; i < numIters; i++) {
92        String addRandomEdit = addRandomEdit("фуу бар буу", 0);
93        List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
94        assertEquals(addRandomEdit, 1, results.size());
95        assertEquals("фуу бар буу фар", results.get(0).key.toString());
96        assertEquals(12, results.get(0).value, 0.01F);
97      }
98      analyzer.close();
99    }
100 
101   /** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */
102   public void testKeyword() throws Exception {
103     Input keys[] = new Input[] {
104         new Input("foo", 50),
105         new Input("bar", 10),
106         new Input("barbar", 12),
107         new Input("barbara", 6)
108     };
109     
110     Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
111     FuzzySuggester suggester = new FuzzySuggester(analyzer);
112     suggester.build(new InputArrayIterator(keys));
113     
114     List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("bariar", random()), false, 2);
115     assertEquals(2, results.size());
116     assertEquals("barbar", results.get(0).key.toString());
117     assertEquals(12, results.get(0).value, 0.01F);
118     
119     results = suggester.lookup(TestUtil.stringToCharSequence("barbr", random()), false, 2);
120     assertEquals(2, results.size());
121     assertEquals("barbar", results.get(0).key.toString());
122     assertEquals(12, results.get(0).value, 0.01F);
123     
124     results = suggester.lookup(TestUtil.stringToCharSequence("barbara", random()), false, 2);
125     assertEquals(2, results.size());
126     assertEquals("barbara", results.get(0).key.toString());
127     assertEquals(6, results.get(0).value, 0.01F);
128     
129     results = suggester.lookup(TestUtil.stringToCharSequence("barbar", random()), false, 2);
130     assertEquals(2, results.size());
131     assertEquals("barbar", results.get(0).key.toString());
132     assertEquals(12, results.get(0).value, 0.01F);
133     assertEquals("barbara", results.get(1).key.toString());
134     assertEquals(6, results.get(1).value, 0.01F);
135     
136     results = suggester.lookup(TestUtil.stringToCharSequence("barbaa", random()), false, 2);
137     assertEquals(2, results.size());
138     assertEquals("barbar", results.get(0).key.toString());
139     assertEquals(12, results.get(0).value, 0.01F);
140     assertEquals("barbara", results.get(1).key.toString());
141     assertEquals(6, results.get(1).value, 0.01F);
142     
143     // top N of 2, but only foo is available
144     results = suggester.lookup(TestUtil.stringToCharSequence("f", random()), false, 2);
145     assertEquals(1, results.size());
146     assertEquals("foo", results.get(0).key.toString());
147     assertEquals(50, results.get(0).value, 0.01F);
148     
149     // top N of 1 for 'bar': we return this even though
150     // barbar is higher because exactFirst is enabled:
151     results = suggester.lookup(TestUtil.stringToCharSequence("bar", random()), false, 1);
152     assertEquals(1, results.size());
153     assertEquals("bar", results.get(0).key.toString());
154     assertEquals(10, results.get(0).value, 0.01F);
155     
156     // top N Of 2 for 'b'
157     results = suggester.lookup(TestUtil.stringToCharSequence("b", random()), false, 2);
158     assertEquals(2, results.size());
159     assertEquals("barbar", results.get(0).key.toString());
160     assertEquals(12, results.get(0).value, 0.01F);
161     assertEquals("bar", results.get(1).key.toString());
162     assertEquals(10, results.get(1).value, 0.01F);
163     
164     // top N of 3 for 'ba'
165     results = suggester.lookup(TestUtil.stringToCharSequence("ba", random()), false, 3);
166     assertEquals(3, results.size());
167     assertEquals("barbar", results.get(0).key.toString());
168     assertEquals(12, results.get(0).value, 0.01F);
169     assertEquals("bar", results.get(1).key.toString());
170     assertEquals(10, results.get(1).value, 0.01F);
171     assertEquals("barbara", results.get(2).key.toString());
172     assertEquals(6, results.get(2).value, 0.01F);
173     
174     analyzer.close();
175   }
176   
177   /**
178    * basic "standardanalyzer" test with stopword removal
179    */
180   public void testStandard() throws Exception {
181     Input keys[] = new Input[] {
182         new Input("the ghost of christmas past", 50),
183     };
184     
185     Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
186     FuzzySuggester suggester = new FuzzySuggester(standard, standard, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, false, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
187         FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, FuzzySuggester.DEFAULT_UNICODE_AWARE);
188     suggester.build(new InputArrayIterator(keys));
189     
190     List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
191     assertEquals(1, results.size());
192     assertEquals("the ghost of christmas past", results.get(0).key.toString());
193     assertEquals(50, results.get(0).value, 0.01F);
194 
195     // omit the 'the' since it's a stopword, it's suggested anyway
196     results = suggester.lookup(TestUtil.stringToCharSequence("ghost of chris", random()), false, 1);
197     assertEquals(1, results.size());
198     assertEquals("the ghost of christmas past", results.get(0).key.toString());
199     assertEquals(50, results.get(0).value, 0.01F);
200 
201     // omit the 'the' and 'of' since they are stopwords, it's suggested anyway
202     results = suggester.lookup(TestUtil.stringToCharSequence("ghost chris", random()), false, 1);
203     assertEquals(1, results.size());
204     assertEquals("the ghost of christmas past", results.get(0).key.toString());
205     assertEquals(50, results.get(0).value, 0.01F);
206     
207     standard.close();
208   }
209 
210   public void testNoSeps() throws Exception {
211     Input[] keys = new Input[] {
212       new Input("ab cd", 0),
213       new Input("abcd", 1),
214     };
215 
216     int options = 0;
217 
218     Analyzer a = new MockAnalyzer(random());
219     FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, true, 1, true, 1, 3, false);
220     suggester.build(new InputArrayIterator(keys));
221     // TODO: would be nice if "ab " would allow the test to
222     // pass, and more generally if the analyzer can know
223     // that the user's current query has ended at a word, 
224     // but, analyzers don't produce SEP tokens!
225     List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence("ab c", random()), false, 2);
226     assertEquals(2, r.size());
227 
228     // With no PRESERVE_SEPS specified, "ab c" should also
229     // complete to "abcd", which has higher weight so should
230     // appear first:
231     assertEquals("abcd", r.get(0).key.toString());
232     a.close();
233   }
234 
  public void testGraphDups() throws Exception {

    // Analyzer that replays hard-coded token graphs: "hotspot" overlaps the
    // wifi/"wi fi" tokens via posInc=0 and posLength>1, mimicking synonym output.
    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
        
        return new TokenStreamComponents(tokenizer) {
          // Streams are handed out in call order: one per build key, then one for the lookup.
          int tokenStreamCounter = 0;
          final TokenStream[] tokenStreams = new TokenStream[] {
            // build key "wifi network is slow": "hotspot" spans "wifi network" (posLength=2)
            new CannedTokenStream(new Token[] {
                token("wifi",1,1),
                token("hotspot",0,2),
                token("network",1,1),
                token("is",1,1),
                token("slow",1,1)
              }),
            // build key "wi fi network is fast": "hotspot" spans "wi fi network" (posLength=3)
            new CannedTokenStream(new Token[] {
                token("wi",1,1),
                token("hotspot",0,3),
                token("fi",1,1),
                token("network",1,1),
                token("is",1,1),
                token("fast",1,1)

              }),
            // lookup query "wifi network"
            new CannedTokenStream(new Token[] {
                token("wifi",1,1),
                token("hotspot",0,2),
                token("network",1,1)
              }),
          };

          @Override
          public TokenStream getTokenStream() {
            TokenStream result = tokenStreams[tokenStreamCounter];
            tokenStreamCounter++;
            return result;
          }
         
          @Override
          protected void setReader(final Reader reader) {
          }
        };
      }
    };

    Input keys[] = new Input[] {
        new Input("wifi network is slow", 50),
        new Input("wi fi network is fast", 10),
    };
    FuzzySuggester suggester = new FuzzySuggester(analyzer);
    suggester.build(new InputArrayIterator(keys));
    
    List<LookupResult> results = suggester.lookup("wifi network", false, 10);
    if (VERBOSE) {
      System.out.println("Results: " + results);
    }
    // Both surface forms match via the shared "hotspot" graph path; higher weight first:
    assertEquals(2, results.size());
    assertEquals("wifi network is slow", results.get(0).key);
    assertEquals(50, results.get(0).value);
    assertEquals("wi fi network is fast", results.get(1).key);
    assertEquals(10, results.get(1).value);
    analyzer.close();
  }
300 
301   public void testEmpty() throws Exception {
302     Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
303     FuzzySuggester suggester = new FuzzySuggester(analyzer);
304     suggester.build(new InputArrayIterator(new Input[0]));
305 
306     List<LookupResult> result = suggester.lookup("a", false, 20);
307     assertTrue(result.isEmpty());
308     analyzer.close();
309   }
310 
  public void testInputPathRequired() throws Exception {

    //  SynonymMap.Builder b = new SynonymMap.Builder(false);
    //  b.add(new CharsRef("ab"), new CharsRef("ba"), true);
    //  final SynonymMap map = b.build();

    //  The Analyzer below mimics the functionality of the SynonymAnalyzer
    //  using the above map, so that the suggest module does not need a dependency on the 
    //  synonym module 

    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
        
        return new TokenStreamComponents(tokenizer) {
          // Streams are handed out in call order: one per build key, then one for the lookup.
          int tokenStreamCounter = 0;
          final TokenStream[] tokenStreams = new TokenStream[] {
            // build key "ab xc": "ba" is a synonym of "ab" (posInc=0)
            new CannedTokenStream(new Token[] {
                token("ab",1,1),
                token("ba",0,1),
                token("xc",1,1)
              }),
            // build key "ba xd": no synonym injected here
            new CannedTokenStream(new Token[] {
                token("ba",1,1),          
                token("xd",1,1)
              }),
            // lookup query "ab x": synonym "ba" again overlaps "ab"
            new CannedTokenStream(new Token[] {
                token("ab",1,1),
                token("ba",0,1),
                token("x",1,1)
              })
          };

          @Override
          public TokenStream getTokenStream() {
            TokenStream result = tokenStreams[tokenStreamCounter];
            tokenStreamCounter++;
            return result;
          }
         
          @Override
          protected void setReader(final Reader reader) {
          }
        };
      }
    };

    Input keys[] = new Input[] {
        new Input("ab xc", 50),
        new Input("ba xd", 50),
    };
    FuzzySuggester suggester = new FuzzySuggester(analyzer);
    suggester.build(new InputArrayIterator(keys));
    List<LookupResult> results = suggester.lookup("ab x", false, 1);
    assertTrue(results.size() == 1);
    analyzer.close();
  }
369 
370   private static Token token(String term, int posInc, int posLength) {
371     final Token t = new Token(term, 0, 0);
372     t.setPositionIncrement(posInc);
373     t.setPositionLength(posLength);
374     return t;
375   }
376 
377   /*
378   private void printTokens(final Analyzer analyzer, String input) throws IOException {
379     System.out.println("Tokens for " + input);
380     TokenStream ts = analyzer.tokenStream("", new StringReader(input));
381     ts.reset();
382     final TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
383     final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
384     final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);
385     
386     while(ts.incrementToken()) {
387       termBytesAtt.fillBytesRef();
388       System.out.println(String.format("%s,%s,%s", termBytesAtt.getBytesRef().utf8ToString(), posIncAtt.getPositionIncrement(), posLengthAtt.getPositionLength()));      
389     }
390     ts.end();
391     ts.close();
392   } 
393   */ 
394 
395   private final Analyzer getUnusualAnalyzer() {
396     return new Analyzer() {
397       @Override
398       protected TokenStreamComponents createComponents(String fieldName) {
399         Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
400         
401         return new TokenStreamComponents(tokenizer) {
402 
403           int count;
404 
405           @Override
406           public TokenStream getTokenStream() {
407             // 4th time we are called, return tokens a b,
408             // else just a:
409             if (count++ != 3) {
410               return new CannedTokenStream(new Token[] {
411                   token("a", 1, 1),
412                 });
413             } else {
414               // After that "a b":
415               return new CannedTokenStream(new Token[] {
416                   token("a", 1, 1),
417                   token("b", 1, 1),
418                 });
419             }
420           }
421          
422           @Override
423           protected void setReader(final Reader reader) {
424           }
425         };
426       }
427     };
428   }
429 
430   public void testExactFirst() throws Exception {
431 
432     Analyzer a = getUnusualAnalyzer();
433     FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, true, 1, true, 1, 3, false);
434     suggester.build(new InputArrayIterator(new Input[] {
435           new Input("x y", 1),
436           new Input("x y z", 3),
437           new Input("x", 2),
438           new Input("z z z", 20),
439         }));
440 
441     //System.out.println("ALL: " + suggester.lookup("x y", false, 6));
442 
443     for(int topN=1;topN<6;topN++) {
444       List<LookupResult> results = suggester.lookup("x y", false, topN);
445       //System.out.println("topN=" + topN + " " + results);
446 
447       assertEquals(Math.min(topN, 4), results.size());
448 
449       assertEquals("x y", results.get(0).key);
450       assertEquals(1, results.get(0).value);
451 
452       if (topN > 1) {
453         assertEquals("z z z", results.get(1).key);
454         assertEquals(20, results.get(1).value);
455 
456         if (topN > 2) {
457           assertEquals("x y z", results.get(2).key);
458           assertEquals(3, results.get(2).value);
459 
460           if (topN > 3) {
461             assertEquals("x", results.get(3).key);
462             assertEquals(2, results.get(3).value);
463           }
464         }
465       }
466     }
467     a.close();
468   }
469 
470   public void testNonExactFirst() throws Exception {
471 
472     Analyzer a = getUnusualAnalyzer();
473     FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, true, 1, true, 1, 3, false);
474 
475     suggester.build(new InputArrayIterator(new Input[] {
476           new Input("x y", 1),
477           new Input("x y z", 3),
478           new Input("x", 2),
479           new Input("z z z", 20),
480         }));
481 
482     for(int topN=1;topN<6;topN++) {
483       List<LookupResult> results = suggester.lookup("p", false, topN);
484 
485       assertEquals(Math.min(topN, 4), results.size());
486 
487       assertEquals("z z z", results.get(0).key);
488       assertEquals(20, results.get(0).value);
489 
490       if (topN > 1) {
491         assertEquals("x y z", results.get(1).key);
492         assertEquals(3, results.get(1).value);
493 
494         if (topN > 2) {
495           assertEquals("x", results.get(2).key);
496           assertEquals(2, results.get(2).value);
497           
498           if (topN > 3) {
499             assertEquals("x y", results.get(3).key);
500             assertEquals(1, results.get(3).value);
501           }
502         }
503       }
504     }
505     a.close();
506   }
507   
508   // Holds surface form separately:
509   private static class TermFreqPayload2 implements Comparable<TermFreqPayload2> {
510     public final String surfaceForm;
511     public final String analyzedForm;
512     public final long weight;
513 
514     public TermFreqPayload2(String surfaceForm, String analyzedForm, long weight) {
515       this.surfaceForm = surfaceForm;
516       this.analyzedForm = analyzedForm;
517       this.weight = weight;
518     }
519 
520     @Override
521     public int compareTo(TermFreqPayload2 other) {
522       int cmp = analyzedForm.compareTo(other.analyzedForm);
523       if (cmp != 0) {
524         return cmp;
525       } else if (weight > other.weight) {
526         return -1;
527       } else if (weight < other.weight) {
528         return 1;
529       } else {
530         assert false;
531         return 0;
532       }
533     }
534   }
535 
536   static boolean isStopChar(char ch, int numStopChars) {
537     //System.out.println("IS? " + ch + ": " + (ch - 'a') + ": " + ((ch - 'a') < numStopChars));
538     return (ch - 'a') < numStopChars;
539   }
540 
541   // Like StopFilter:
  // Like StopFilter: removes single-character tokens that fall in the stop-char
  // range (see isStopChar), optionally folding the removed positions into the
  // next kept token's position increment.
  private static class TokenEater extends TokenFilter {
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final int numStopChars;
    private final boolean preserveHoles;
    // True until the first token is emitted; used to repair an illegal leading posInc=0.
    private boolean first;

    public TokenEater(boolean preserveHoles, TokenStream in, int numStopChars) {
      super(in);
      this.preserveHoles = preserveHoles;
      this.numStopChars = numStopChars;
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      first = true;
    }

    @Override
    public final boolean incrementToken() throws IOException {
      // Sum of position increments of removed (stop) tokens since the last kept token:
      int skippedPositions = 0;
      while (input.incrementToken()) {
        // Keep any token that is not a single stop character:
        if (termAtt.length() != 1 || !isStopChar(termAtt.charAt(0), numStopChars)) {
          int posInc = posIncrAtt.getPositionIncrement() + skippedPositions;
          if (first) {
            if (posInc == 0) {
              // first token having posinc=0 is illegal.
              posInc = 1;
            }
            first = false;
          }
          posIncrAtt.setPositionIncrement(posInc);
          //System.out.println("RETURN term=" + termAtt + " numStopChars=" + numStopChars);
          return true;
        }
        if (preserveHoles) {
          // Carry the removed token's position forward so a "hole" remains in the graph:
          skippedPositions += posIncrAtt.getPositionIncrement();
        }
      }

      return false;
    }
  }
586 
587   private static class MockTokenEatingAnalyzer extends Analyzer {
588     private int numStopChars;
589     private boolean preserveHoles;
590 
591     public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) {
592       this.preserveHoles = preserveHoles;
593       this.numStopChars = numStopChars;
594     }
595 
596     @Override
597     public TokenStreamComponents createComponents(String fieldName) {
598       MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
599       tokenizer.setEnableChecks(true);
600       TokenStream next;
601       if (numStopChars != 0) {
602         next = new TokenEater(preserveHoles, tokenizer, numStopChars);
603       } else {
604         next = tokenizer;
605       }
606       return new TokenStreamComponents(tokenizer, next);
607     }
608   }
609 
610   public void testRandom() throws Exception {
611 
612     int numQueries = atLeast(100);
613     
614     final List<TermFreqPayload2> slowCompletor = new ArrayList<>();
615     final TreeSet<String> allPrefixes = new TreeSet<>();
616     final Set<String> seen = new HashSet<>();
617     
618     Input[] keys = new Input[numQueries];
619 
620     boolean preserveSep = random().nextBoolean();
621     boolean unicodeAware = random().nextBoolean();
622 
623     final int numStopChars = random().nextInt(10);
624     final boolean preserveHoles = random().nextBoolean();
625 
626     if (VERBOSE) {
627       System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " ; unicodeAware=" + unicodeAware + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
628     }
629     
630     for (int i = 0; i < numQueries; i++) {
631       int numTokens = TestUtil.nextInt(random(), 1, 4);
632       String key;
633       String analyzedKey;
634       while(true) {
635         key = "";
636         analyzedKey = "";
637         boolean lastRemoved = false;
638         for(int token=0;token < numTokens;token++) {
639           String s;
640           while (true) {
641             // TODO: would be nice to fix this slowCompletor/comparator to
642             // use full range, but we might lose some coverage too...
643             s = TestUtil.randomSimpleString(random());
644             if (s.length() > 0) {
645               if (token > 0) {
646                 key += " ";
647               }
648               if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointAt(analyzedKey.codePointCount(0, analyzedKey.length())-1) != ' ' : analyzedKey.charAt(analyzedKey.length()-1) != ' ')) {
649                 analyzedKey += " ";
650               }
651               key += s;
652               if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
653                 if (preserveSep && preserveHoles) {
654                   analyzedKey += '\u0000';
655                 }
656                 lastRemoved = true;
657               } else {
658                 analyzedKey += s;
659                 lastRemoved = false;
660               }
661               break;
662             }
663           }
664         }
665 
666         analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", "");
667 
668         if (preserveSep && lastRemoved) {
669           analyzedKey += " ";
670         }
671 
672         // Don't add same surface form more than once:
673         if (!seen.contains(key)) {
674           seen.add(key);
675           break;
676         }
677       }
678 
679       for (int j = 1; j < key.length(); j++) {
680         allPrefixes.add(key.substring(0, j));
681       }
682       // we can probably do Integer.MAX_VALUE here, but why worry.
683       int weight = random().nextInt(1<<24);
684       keys[i] = new Input(key, weight);
685 
686       slowCompletor.add(new TermFreqPayload2(key, analyzedKey, weight));
687     }
688 
689     if (VERBOSE) {
690       // Don't just sort original list, to avoid VERBOSE
691       // altering the test:
692       List<TermFreqPayload2> sorted = new ArrayList<>(slowCompletor);
693       Collections.sort(sorted);
694       for(TermFreqPayload2 ent : sorted) {
695         System.out.println("  surface='" + ent.surfaceForm + " analyzed='" + ent.analyzedForm + "' weight=" + ent.weight);
696       }
697     }
698 
699     Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
700     FuzzySuggester suggester = new FuzzySuggester(a, a,
701                                                   preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, true, 1, false, 1, 3, unicodeAware);
702     suggester.build(new InputArrayIterator(keys));
703 
704     for (String prefix : allPrefixes) {
705 
706       if (VERBOSE) {
707         System.out.println("\nTEST: prefix=" + prefix);
708       }
709 
710       final int topN = TestUtil.nextInt(random(), 1, 10);
711       List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence(prefix, random()), false, topN);
712 
713       // 2. go thru whole set to find suggestions:
714       List<LookupResult> matches = new ArrayList<>();
715 
716       // "Analyze" the key:
717       String[] tokens = prefix.split(" ");
718       StringBuilder builder = new StringBuilder();
719       boolean lastRemoved = false;
720       for(int i=0;i<tokens.length;i++) {
721         String token = tokens[i];
722         if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
723           builder.append(' ');
724         }
725 
726         if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {
727           if (preserveSep && preserveHoles) {
728             builder.append("\u0000");
729           }
730           lastRemoved = true;
731         } else {
732           builder.append(token);
733           lastRemoved = false;
734         }
735       }
736 
737       String analyzedKey = builder.toString();
738 
739       // Remove trailing sep/holes (TokenStream.end() does
740       // not tell us any trailing holes, yet ... there is an
741       // issue open for this):
742       while (true) {
743         String s = analyzedKey.replaceAll("(^| )\u0000$", "");
744         s = s.replaceAll("\\s+$", "");
745         if (s.equals(analyzedKey)) {
746           break;
747         }
748         analyzedKey = s;
749       }
750 
751       if (analyzedKey.length() == 0) {
752         // Currently suggester can't suggest from the empty
753         // string!  You get no results, not all results...
754         continue;
755       }
756 
757       if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
758         analyzedKey += " ";
759       }
760 
761       if (VERBOSE) {
762         System.out.println("  analyzed: " + analyzedKey);
763       }
764       TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();
765 
766       // NOTE: not great that we ask the suggester to give
767       // us the "answer key" (ie maybe we have a bug in
768       // suggester.toLevA ...) ... but testRandom2() fixes
769       // this:
770       Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
771       assertTrue(automaton.isDeterministic());
772 
773       // TODO: could be faster... but it's slowCompletor for a reason
774       BytesRefBuilder spare = new BytesRefBuilder();
775       for (TermFreqPayload2 e : slowCompletor) {
776         spare.copyChars(e.analyzedForm);
777         FiniteStringsIterator finiteStrings =
778             new FiniteStringsIterator(suggester.toAutomaton(spare.get(), tokenStreamToAutomaton));
779         for (IntsRef string; (string = finiteStrings.next()) != null;) {
780           int p = 0;
781           BytesRef ref = Util.toBytesRef(string, spare);
782           boolean added = false;
783           for (int i = ref.offset; i < ref.length; i++) {
784             int q = automaton.step(p, ref.bytes[i] & 0xff);
785             if (q == -1) {
786               break;
787             } else if (automaton.isAccept(q)) {
788               matches.add(new LookupResult(e.surfaceForm, e.weight));
789               added = true;
790               break;
791             }
792             p = q;
793           }
794           if (!added && automaton.isAccept(p)) {
795             matches.add(new LookupResult(e.surfaceForm, e.weight));
796           } 
797         }
798       }
799 
800       assertTrue(numStopChars > 0 || matches.size() > 0);
801 
802       if (matches.size() > 1) {
803         Collections.sort(matches, new Comparator<LookupResult>() {
804             @Override
805             public int compare(LookupResult left, LookupResult right) {
806               int cmp = Float.compare(right.value, left.value);
807               if (cmp == 0) {
808                 return left.compareTo(right);
809               } else {
810                 return cmp;
811               }
812             }
813           });
814       }
815 
816       if (matches.size() > topN) {
817         matches = matches.subList(0, topN);
818       }
819 
820       if (VERBOSE) {
821         System.out.println("  expected:");
822         for(LookupResult lr : matches) {
823           System.out.println("    key=" + lr.key + " weight=" + lr.value);
824         }
825 
826         System.out.println("  actual:");
827         for(LookupResult lr : r) {
828           System.out.println("    key=" + lr.key + " weight=" + lr.value);
829         }
830       }
831       
832       assertEquals(prefix + "  " + topN, matches.size(), r.size());
833       for(int hit=0;hit<r.size();hit++) {
834         //System.out.println("  check hit " + hit);
835         assertEquals(prefix + "  " + topN, matches.get(hit).key.toString(), r.get(hit).key.toString());
836         assertEquals(matches.get(hit).value, r.get(hit).value, 0f);
837       }
838     }
839     a.close();
840   }
841 
842   public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
843     Analyzer a = new MockAnalyzer(random());
844     FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, true, 1, true, 1, 3, false);
845 
846     List<Input> keys = Arrays.asList(new Input[] {
847         new Input("a", 40),
848         new Input("a ", 50),
849         new Input(" a", 60),
850       });
851 
852     Collections.shuffle(keys, random());
853     suggester.build(new InputArrayIterator(keys));
854 
855     List<LookupResult> results = suggester.lookup("a", false, 5);
856     assertEquals(2, results.size());
857     assertEquals(" a", results.get(0).key);
858     assertEquals(60, results.get(0).value);
859     assertEquals("a ", results.get(1).key);
860     assertEquals(50, results.get(1).value);
861     a.close();
862   }
863 
864   public void testEditSeps() throws Exception {
865     Analyzer a = new MockAnalyzer(random());
866     FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, true, 2, true, 1, 3, false);
867 
868     List<Input> keys = Arrays.asList(new Input[] {
869         new Input("foo bar", 40),
870         new Input("foo bar baz", 50),
871         new Input("barbaz", 60),
872         new Input("barbazfoo", 10),
873       });
874 
875     Collections.shuffle(keys, random());
876     suggester.build(new InputArrayIterator(keys));
877 
878     assertEquals("[foo bar baz/50, foo bar/40]", suggester.lookup("foobar", false, 5).toString());
879     assertEquals("[foo bar baz/50]", suggester.lookup("foobarbaz", false, 5).toString());
880     assertEquals("[barbaz/60, barbazfoo/10]", suggester.lookup("bar baz", false, 5).toString());
881     assertEquals("[barbazfoo/10]", suggester.lookup("bar baz foo", false, 5).toString());
882     a.close();
883   }
884   
  @SuppressWarnings("fallthrough")
  /**
   * Applies at most one random single edit (transposition, deletion, or
   * insertion) to {@code string}, never touching the first
   * {@code prefixLength} chars and never editing at the last char.  May
   * return the input unchanged if the coin flips never pick an edit site.
   */
  private static String addRandomEdit(String string, int prefixLength) {
    char[] input = string.toCharArray();
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < input.length; i++) {
      // Edit sites: past the protected prefix, 50% coin, never the last char:
      if (i >= prefixLength && random().nextBoolean() && i < input.length-1) {
        switch(random().nextInt(4)) {
          case 3:
            // Transpose input[i] and input[1+i]:
            // NOTE(review): the outer guard already guarantees
            // i < input.length-1, so this branch always returns and the
            // fall-through to delete below is dead code.
            if (i < input.length-1) {
              builder.append(input[i+1]);
              builder.append(input[i]);
              for(int j=i+2;j<input.length;j++) {
                builder.append(input[j]);
              }
              return builder.toString();
            }
            // NOTE: fall through to delete:
          case 2:
            // Delete input[i], copy the remainder verbatim:
            for (int j = i+1; j < input.length; j++) {
              builder.append(input[j]);  
            }
            return builder.toString();
          case 1:
            // Insert input[i+1] twice
            // NOTE(review): despite the comment above, this appends
            // input[i+1] then input[i] — i.e. it performs a transposition
            // like case 3, not a duplicate insert.  Harmless for a fuzzy
            // test (still a valid random edit), but comment and code
            // disagree — confirm intent before "fixing" either.
            if (i+1<input.length) {
              builder.append(input[i+1]);
              builder.append(input[i++]);
              i++;
            }
            for (int j = i; j < input.length; j++) {
              builder.append(input[j]);
            }
            return builder.toString();
          case 0:
            // Insert random byte.
            // NOTE: can only use ascii here so that, in
            // UTF8 byte space it's still a single
            // insertion:
            // bytes 0x1e and 0x1f are reserved
            int x = random().nextBoolean() ? random().nextInt(30) :  32 + random().nextInt(128 - 32);
            builder.append((char) x);
            for (int j = i; j < input.length; j++) {
              builder.append(input[j]);  
            }
            return builder.toString();
        }
      }

      builder.append(input[i]);
    }

    return builder.toString();
  }
940 
941   private String randomSimpleString(int maxLen) {
942     final int len = TestUtil.nextInt(random(), 1, maxLen);
943     final char[] chars = new char[len];
944     for(int j=0;j<len;j++) {
945       chars[j] = (char) ('a' + random().nextInt(4));
946     }
947     return new String(chars);
948   }
949 
  /**
   * End-to-end randomized test: builds a FuzzySuggester over random
   * small-alphabet strings and checks every lookup against the
   * slowFuzzyMatch brute-force oracle.
   */
  public void testRandom2() throws Throwable {
    final int NUM = atLeast(200);
    final List<Input> answers = new ArrayList<>();
    final Set<String> seen = new HashSet<>();
    // Generate up to NUM unique random targets with random weights:
    for(int i=0;i<NUM;i++) {
      final String s = randomSimpleString(8);
      if (!seen.contains(s)) {
        answers.add(new Input(s, random().nextInt(1000)));
        seen.add(s);
      }
    }

    // Sort by term only so the VERBOSE dump below is readable:
    Collections.sort(answers, new Comparator<Input>() {
        @Override
        public int compare(Input a, Input b) {
          return a.term.compareTo(b.term);
        }
      });
    if (VERBOSE) {
      System.out.println("\nTEST: targets");
      for(Input tf : answers) {
        System.out.println("  " + tf.term.utf8ToString() + " freq=" + tf.v);
      }
    }

    // KEYWORD tokenizer: each input is a single token, no analysis surprises.
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
    int maxEdits = random().nextBoolean() ? 1 : 2;
    int prefixLen = random().nextInt(4);
    boolean transpositions = random().nextBoolean();
    // TODO: test graph analyzers
    // TODO: test exactFirst / preserveSep permutations
    FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, true, maxEdits, transpositions, prefixLen, prefixLen, false);

    if (VERBOSE) {
      System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);
    }

    Collections.shuffle(answers, random());
    suggest.build(new InputArrayIterator(answers.toArray(new Input[answers.size()])));

    final int ITERS = atLeast(100);
    for(int iter=0;iter<ITERS;iter++) {
      // Random query fragment, shorter than the targets on average:
      final String frag = randomSimpleString(6);
      if (VERBOSE) {
        System.out.println("\nTEST: iter frag=" + frag);
      }
      final List<LookupResult> expected = slowFuzzyMatch(prefixLen, maxEdits, transpositions, answers, frag);
      if (VERBOSE) {
        System.out.println("  expected: " + expected.size());
        for(LookupResult c : expected) {
          System.out.println("    " + c);
        }
      }
      final List<LookupResult> actual = suggest.lookup(frag, false, NUM);
      if (VERBOSE) {
        System.out.println("  actual: " + actual.size());
        for(LookupResult c : actual) {
          System.out.println("    " + c);
        }
      }

      // Normalize the suggester's output to the oracle's ordering
      // (weight descending, then key) before comparing:
      Collections.sort(actual, new CompareByCostThenAlpha());

      // Compare element-by-element up to the shorter list first, so a
      // mismatch yields a useful message before the size assert below:
      final int limit = Math.min(expected.size(), actual.size());
      for(int ans=0;ans<limit;ans++) {
        final LookupResult c0 = expected.get(ans);
        final LookupResult c1 = actual.get(ans);
        assertEquals("expected " + c0.key +
                     " but got " + c1.key,
                     0,
                     CHARSEQUENCE_COMPARATOR.compare(c0.key, c1.key));
        assertEquals(c0.value, c1.value);
      }
      assertEquals(expected.size(), actual.size());
    }
    a.close();
  }
1027 
1028   private List<LookupResult> slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List<Input> answers, String frag) {
1029     final List<LookupResult> results = new ArrayList<>();
1030     final int fragLen = frag.length();
1031     for(Input tf : answers) {
1032       //System.out.println("  check s=" + tf.term.utf8ToString());
1033       boolean prefixMatches = true;
1034       for(int i=0;i<prefixLen;i++) {
1035         if (i == fragLen) {
1036           // Prefix still matches:
1037           break;
1038         }
1039         if (i == tf.term.length || tf.term.bytes[i] != (byte) frag.charAt(i)) {
1040           prefixMatches = false;
1041           break;
1042         }
1043       }
1044       //System.out.println("    prefixMatches=" + prefixMatches);
1045 
1046       if (prefixMatches) {
1047         final int len = tf.term.length;
1048         if (len >= fragLen-maxEdits) {
1049           // OK it's possible:
1050           //System.out.println("    possible");
1051           int d;
1052           final String s = tf.term.utf8ToString();
1053           if (fragLen == prefixLen) {
1054             d = 0;
1055           } else if (false && len < fragLen) {
1056             d = getDistance(frag, s, allowTransposition);
1057           } else {
1058             //System.out.println("    try loop");
1059             d = maxEdits + 1;
1060             //for(int ed=-maxEdits;ed<=maxEdits;ed++) {
1061             for(int ed=-maxEdits;ed<=maxEdits;ed++) {
1062               if (s.length() < fragLen - ed) {
1063                 continue;
1064               }
1065               String check = s.substring(0, fragLen-ed);
1066               d = getDistance(frag, check, allowTransposition);
1067               //System.out.println("    sub check s=" + check + " d=" + d);
1068               if (d <= maxEdits) {
1069                 break;
1070               }
1071             }
1072           }
1073           if (d <= maxEdits) {
1074             results.add(new LookupResult(tf.term.utf8ToString(), tf.v));
1075           }
1076         }
1077       }
1078 
1079       Collections.sort(results, new CompareByCostThenAlpha());
1080     }
1081 
1082     return results;
1083   }
1084 
1085   private static class CharSequenceComparator implements Comparator<CharSequence> {
1086 
1087     @Override
1088     public int compare(CharSequence o1, CharSequence o2) {
1089       final int l1 = o1.length();
1090       final int l2 = o2.length();
1091       
1092       final int aStop = Math.min(l1, l2);
1093       for (int i = 0; i < aStop; i++) {
1094         int diff = o1.charAt(i) - o2.charAt(i);
1095         if (diff != 0) {
1096           return diff;
1097         }
1098       }
1099       // One is a prefix of the other, or, they are equal:
1100       return l1 - l2;
1101     }
1102   }
1103 
  /** Shared stateless instance used for key comparison and tie-breaking. */
  private static final Comparator<CharSequence> CHARSEQUENCE_COMPARATOR = new CharSequenceComparator();
1105 
1106   public class CompareByCostThenAlpha implements Comparator<LookupResult> {
1107     @Override
1108     public int compare(LookupResult a, LookupResult b) {
1109       if (a.value > b.value) {
1110         return -1;
1111       } else if (a.value < b.value) {
1112         return 1;
1113       } else {
1114         final int c = CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
1115         assert c != 0: "term=" + a.key;
1116         return c;
1117       }
1118     }
1119   }
1120 
1121   // NOTE: copied from
1122   // modules/suggest/src/java/org/apache/lucene/search/spell/LuceneLevenshteinDistance.java
1123   // and tweaked to return the edit distance not the float
1124   // lucene measure
1125 
  /* Finds the unicode (code point) Levenshtein (edit) distance between two
   * strings, optionally counting adjacent transpositions as single edits. */
1128   public int getDistance(String target, String other, boolean allowTransposition) {
1129     IntsRef targetPoints;
1130     IntsRef otherPoints;
1131     int n;
1132     int d[][]; // cost array
1133     
1134     // NOTE: if we cared, we could 3*m space instead of m*n space, similar to 
1135     // what LevenshteinDistance does, except cycling thru a ring of three 
1136     // horizontal cost arrays... but this comparator is never actually used by 
1137     // DirectSpellChecker, it's only used for merging results from multiple shards 
1138     // in "distributed spellcheck", and it's inefficient in other ways too...
1139 
1140     // cheaper to do this up front once
1141     targetPoints = toIntsRef(target);
1142     otherPoints = toIntsRef(other);
1143     n = targetPoints.length;
1144     final int m = otherPoints.length;
1145     d = new int[n+1][m+1];
1146     
1147     if (n == 0 || m == 0) {
1148       if (n == m) {
1149         return 0;
1150       }
1151       else {
1152         return Math.max(n, m);
1153       }
1154     } 
1155 
1156     // indexes into strings s and t
1157     int i; // iterates through s
1158     int j; // iterates through t
1159 
1160     int t_j; // jth character of t
1161 
1162     int cost; // cost
1163 
1164     for (i = 0; i<=n; i++) {
1165       d[i][0] = i;
1166     }
1167     
1168     for (j = 0; j<=m; j++) {
1169       d[0][j] = j;
1170     }
1171 
1172     for (j = 1; j<=m; j++) {
1173       t_j = otherPoints.ints[j-1];
1174 
1175       for (i=1; i<=n; i++) {
1176         cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
1177         // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
1178         d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
1179         // transposition
1180         if (allowTransposition && i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
1181           d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
1182         }
1183       }
1184     }
1185     
1186     return d[n][m];
1187   }
1188   
1189   private static IntsRef toIntsRef(String s) {
1190     IntsRef ref = new IntsRef(s.length()); // worst case
1191     int utf16Len = s.length();
1192     for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
1193       cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
1194     }
1195     return ref;
1196   }
1197 }